Worked by : Ana Menkshi

~ Connection

# Interface for database communication in R

library(DBI)
## Warning: package 'DBI' was built under R version 4.3.3
# Enables connections via ODBC drivers
library(odbc)
## Warning: package 'odbc' was built under R version 4.3.3
# Open an ODBC connection to the "Book" database on server "Ana".
# No user name or password is passed; Trusted_Connection = "Yes" is set instead.
con <- DBI::dbConnect(
  odbc::odbc(),
  Driver = "ODBC Driver 17 for SQL Server",
  Server = "Ana",
  Database = "Book",
  Trusted_Connection = "Yes",
  Port = 1433
)


# Disconnect when done

#dbDisconnect(con)

~Data cleaning

# Load the whole "books" table into memory as a data frame
books <- dbReadTable(con, "books")

# The rest of the analysis works on this in-memory copy, so release the
# database connection now instead of leaving it open for the whole session.
dbDisconnect(con)

# 1. Display the first 5 rows
head(books, 5)
##   id book_id work_id books_count      isbn       isbn13
## 1  1 2767052 2792775         272 439023483 9.780438e+12
## 2  2       3 4640799         491 439554934 9.780440e+12
## 3  3   41865 3212258         226 316015849 9.780316e+12
## 4  4    2657 3275794         487  61120081 9.780061e+12
## 5  5    4671  245494        1356 743273567 9.780744e+12
##                        authors original_publication_year
## 1              Suzanne Collins                      2008
## 2 J.K. Rowling, Mary GrandPré                      1997
## 3              Stephenie Meyer                      2005
## 4                   Harper Lee                      1960
## 5          F. Scott Fitzgerald                      1925
##                             original_title
## 1                         the hunger games
## 2 harry potter and the philosopher's stone
## 3                                 twilight
## 4                    to kill a mockingbird
## 5                         the great gatsby
##                                                      title language_code
## 1                  The Hunger Games (The Hunger Games, #1)           eng
## 2 Harry Potter and the Sorcerer's Stone (Harry Potter, #1)           eng
## 3                                  Twilight (Twilight, #1)         en-US
## 4                                    To Kill a Mockingbird           eng
## 5                                         The Great Gatsby           eng
##   average_rating ratings_count work_ratings_count work_text_reviews_count
## 1           4.34       4780653            4942365                  155254
## 2           4.44       4602479            4800065                   75867
## 3           3.57       3866839            3916824                   95009
## 4           4.25       3198671            3340896                   72586
## 5           3.89       2683664            2773745                   51992
##   ratings_1 ratings_2 ratings_3 ratings_4 ratings_5
## 1     66715    127936    560092   1481305   2706317
## 2     75504    101676    455024   1156318   3011543
## 3    456191    436802    793319    875073   1355439
## 4     60427    117415    446835   1001952   1714267
## 5     86236    197621    606158    936012    947718
##                                                    image_url
## 1 https://images.gr-assets.com/books/1447303603m/2767052.jpg
## 2       https://images.gr-assets.com/books/1474154022m/3.jpg
## 3   https://images.gr-assets.com/books/1361039443m/41865.jpg
## 4    https://images.gr-assets.com/books/1361975680m/2657.jpg
## 5    https://images.gr-assets.com/books/1490528560m/4671.jpg
##                                              small_image_url
## 1 https://images.gr-assets.com/books/1447303603s/2767052.jpg
## 2       https://images.gr-assets.com/books/1474154022s/3.jpg
## 3   https://images.gr-assets.com/books/1361039443s/41865.jpg
## 4    https://images.gr-assets.com/books/1361975680s/2657.jpg
## 5    https://images.gr-assets.com/books/1490528560s/4671.jpg
##                    category age_group
## 1        Real-World Fiction     adult
## 2 Fantasy/Adventure Fiction     young
## 3        Real-World Fiction     young
## 4 Fantasy/Adventure Fiction     adult
## 5        Real-World Fiction     young
# 2. Table dimensions: number of rows, then number of columns
c(nrow(books), ncol(books))
## [1] 9447   24
# 3. Names of all columns
names(books)
##  [1] "id"                        "book_id"                  
##  [3] "work_id"                   "books_count"              
##  [5] "isbn"                      "isbn13"                   
##  [7] "authors"                   "original_publication_year"
##  [9] "original_title"            "title"                    
## [11] "language_code"             "average_rating"           
## [13] "ratings_count"             "work_ratings_count"       
## [15] "work_text_reviews_count"   "ratings_1"                
## [17] "ratings_2"                 "ratings_3"                
## [19] "ratings_4"                 "ratings_5"                
## [21] "image_url"                 "small_image_url"          
## [23] "category"                  "age_group"
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# 4. Remove fully duplicated rows (distinct() keeps one copy of each row)

books <- distinct(books)

# Convert original_publication_year to integer
books$original_publication_year <- as.integer(books$original_publication_year)

# 5. Count NA and empty-string values per column.
# vapply() pins the result to exactly one integer per column, whereas
# sapply() chooses its return type from the data it happens to receive.
na_empty_counts <- vapply(
  books,
  function(col) sum(is.na(col) | col == ""),
  integer(1)
)
na_empty_df <- data.frame(
  Column = names(na_empty_counts),
  MissingOrEmpty = na_empty_counts
)
# Keep only the columns that actually contain missing or empty values
na_empty_df <- na_empty_df[na_empty_df$MissingOrEmpty > 0, ]

# Print columns with NA or empty values
print("Columns with missing or empty values:")
## [1] "Columns with missing or empty values:"
print(na_empty_df)
##                                              Column MissingOrEmpty
## isbn                                           isbn            560
## isbn13                                       isbn13            458
## original_publication_year original_publication_year              6
## language_code                         language_code           1006
# 5.1. Sanity check after the distinct() step above: collect every row that
#      is still an exact copy of an earlier row
duplicate_rows <- books[which(duplicated(books)), ]

# 5.2. Report how many such rows remain
num_duplicates <- dim(duplicate_rows)[1]
cat("Number of duplicate rows in the 'books' table:", num_duplicates, "\n")
## Number of duplicate rows in the 'books' table: 0
# 6. Add 'work_rating': ratings count plus text-review count for each work
books[["work_rating"]] <- with(
  books,
  work_ratings_count + work_text_reviews_count
)

# Show the two inputs next to the new column (head() defaults to six rows)
head(books[c("work_ratings_count", "work_text_reviews_count", "work_rating")])
##   work_ratings_count work_text_reviews_count work_rating
## 1            4942365                  155254     5097619
## 2            4800065                   75867     4875932
## 3            3916824                   95009     4011833
## 4            3340896                   72586     3413482
## 5            2773745                   51992     2825737
## 6            2478609                  140739     2619348
# 7. Drop six columns: isbn, isbn13, image_url, small_image_url,
#    language_code and original_publication_year
books <- books %>%
  select(-c(isbn, isbn13, image_url, small_image_url, language_code,
            original_publication_year))

# Inspect the structure of the cleaned table
str(books)
## 'data.frame':    9434 obs. of  19 variables:
##  $ id                     : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ book_id                : int  2767052 3 41865 2657 4671 11870085 5907 5107 960 1885 ...
##  $ work_id                : int  2792775 4640799 3212258 3275794 245494 16827462 1540236 3036731 3338963 3060926 ...
##  $ books_count            : int  272 491 226 487 1356 226 969 360 311 3455 ...
##  $ authors                : chr  "Suzanne Collins" "J.K. Rowling, Mary GrandPré" "Stephenie Meyer" "Harper Lee" ...
##  $ original_title         : chr  "the hunger games" "harry potter and the philosopher's stone" "twilight" "to kill a mockingbird" ...
##  $ title                  : chr  "The Hunger Games (The Hunger Games, #1)" "Harry Potter and the Sorcerer's Stone (Harry Potter, #1)" "Twilight (Twilight, #1)" "To Kill a Mockingbird" ...
##  $ average_rating         : num  4.34 4.44 3.57 4.25 3.89 ...
##  $ ratings_count          : int  4780653 4602479 3866839 3198671 2683664 2346404 2071616 2044241 2001311 2035490 ...
##  $ work_ratings_count     : int  4942365 4800065 3916824 3340896 2773745 2478609 2196809 2120637 2078754 2191465 ...
##  $ work_text_reviews_count: int  155254 75867 95009 72586 51992 140739 37653 44920 25112 49152 ...
##  $ ratings_1              : int  66715 75504 456191 60427 86236 47994 46023 109383 77841 54700 ...
##  $ ratings_2              : int  127936 101676 436802 117415 197621 92723 76784 185520 145740 86485 ...
##  $ ratings_3              : int  560092 455024 793319 446835 606158 327550 288649 455042 458429 284852 ...
##  $ ratings_4              : int  1481305 1156318 875073 1001952 936012 698471 665635 661516 716569 609755 ...
##  $ ratings_5              : int  2706317 3011543 1355439 1714267 947718 1311871 1119718 709176 680175 1155673 ...
##  $ category               : chr  "Real-World Fiction" "Fantasy/Adventure Fiction" "Real-World Fiction" "Fantasy/Adventure Fiction" ...
##  $ age_group              : chr  "adult" "young" "young" "adult" ...
##  $ work_rating            : int  5097619 4875932 4011833 3413482 2825737 2619348 2234462 2165557 2103866 2240617 ...

~Exploratory Data Analysis

Thesis: Older age groups prefer Real-World Fiction books the most.

Antithesis: Genre preference is not strictly determined by age.

library(dplyr)
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.3.3
# Reshape the five per-star count columns into long format: one row per
# (book, star level). names_prefix strips the leading "ratings_" while
# pivoting, so rating_level holds "1".."5" without a separate clean-up step.
ratings_long <- books %>%
  select(original_title, ratings_1, ratings_2, ratings_3, ratings_4, ratings_5) %>%
  pivot_longer(
    cols = starts_with("ratings_"),
    names_to = "rating_level",
    names_prefix = "ratings_",
    values_to = "count"
  )
  1. Graphic that shows the number of ratings from 1 to 5.
library(ggplot2)
library(scales)
## Warning: package 'scales' was built under R version 4.3.3
# Jittered scatter of the per-book rating counts at each star level;
# y-axis breaks are placed every 500,000 ratings and printed with commas
ggplot(data = ratings_long, mapping = aes(x = rating_level, y = count)) +
  geom_jitter(width = 0.2, alpha = 0.3, color = "#8e44ad") +
  labs(
    title = "Scatter Plot of Book Ratings (1–5 Stars)",
    x = "Rating Level",
    y = "Number of Ratings"
  ) +
  scale_y_continuous(
    breaks = seq(from = 0, to = max(ratings_long$count), by = 500000),
    labels = comma
  ) +
  theme_minimal(base_size = 14) +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))

In the graph above, we see that the rating activity has been predominantly positive.

  2. Distribution of Real-World Fiction vs Fantasy Fiction
library(ggplot2)

# Bar chart of the number of titles in each book category.
# The legend is hidden because the fill colour repeats the x-axis variable.
ggplot(data = books, mapping = aes(x = category, fill = category)) +
  geom_bar() +
  theme_minimal() +
  theme(legend.position = "none") +
  labs(
    title = "Distribution of Real-World Fiction vs Fantasy Fiction",
    x = "Book Category",
    y = "Number of Titles"
  )

Here we see that Real-World Fiction has more book titles than Fantasy/Adventure Fiction.

  3. Age Group vs Category (Cross Tab)
# Cross-tabulation: age groups in rows, categories in columns
table(books[["age_group"]], books[["category"]])
##        
##         Fantasy/Adventure Fiction Real-World Fiction
##   adult                       735               4881
##   young                      1721               2097

Charts 4 and 5 show the same information; only the graphical view changes.

  4. Heatmap of Age Group vs Category
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.3.3
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
# Cross-tabulate age group by category, then flatten the table into a data
# frame with one row per (age_group, category) cell and its count
cross_tab <- table(books$age_group, books$category)
df_tab <- setNames(
  as.data.frame(cross_tab),
  c("age_group", "category", "count")
)

# Heatmap: tile colour and the printed label both show the cell count
ggplot(data = df_tab, mapping = aes(x = category, y = age_group, fill = count)) +
  geom_tile() +
  geom_text(mapping = aes(label = count), color = "white") +
  scale_fill_gradient(low = "#add8e6", high = "#003366") +
  theme_minimal() +
  labs(
    title = " Age Group vs Book Category Heatmap",
    x = "Category",
    y = "Age Group"
  )

Antithesis –– A possible defense of the antithesis is the fact that the ‘young’ category also shows the same tendency, with a very small gap between the number of titles read in each genre. This can also be attributed to the fact that there are more distinct book titles available in the Real-World Fiction category, which implies that the same logic could apply to the adult category as well — and thus, the difference may not necessarily reflect true preference.

Argument for Thesis –– In this heatmap, we see that the majority of Real-World Fiction titles have been read by the ‘adult’ age group, which implies that this age group prefers this genre more. Furthermore, there is a noticeable gap between the number of adults reading Real-World Fiction and those reading the other genre, clearly surpassing the latter.

  5. Mosaic Plot: Age Group vs Fiction Category

# install.packages("ggmosaic")

library(ggmosaic)
## Warning: package 'ggmosaic' was built under R version 4.3.3
# Mosaic plot of category (fill) within each age group (x axis)
ggplot(data = books) +
  geom_mosaic(
    mapping = aes(x = product(age_group), fill = category),
    na.rm = TRUE
  ) +
  scale_fill_manual(values = c("#f7c6c7", "#f49ac2")) +
  labs(
    title = "Mosaic Plot: Age Group vs Fiction Category",
    x = "Age Group",
    y = "Proportion",
    fill = "Category"
  ) +
  theme_minimal(base_size = 14) +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))
## Warning: The `scale_name` argument of `continuous_scale()` is deprecated as of ggplot2
## 3.5.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `trans` argument of `continuous_scale()` is deprecated as of ggplot2 3.5.0.
## ℹ Please use the `transform` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: `unite_()` was deprecated in tidyr 1.2.0.
## ℹ Please use `unite()` instead.
## ℹ The deprecated feature was likely used in the ggmosaic package.
##   Please report the issue at <https://github.com/haleyjeppson/ggmosaic>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Antithesis –– A possible defense of the antithesis is the fact that the ‘young’ category also shows the same tendency, with a very small gap between the number of titles read in each genre. This can also be attributed to the fact that there are more distinct book titles available in the Real-World Fiction category, which implies that the same logic could apply to the adult category as well — and thus, the difference may not necessarily reflect true preference.

Counterargument –– It’s true that the number of titles in the Real-World Fiction category is higher, but we can clearly see that the gap between the two genres in the adult category is significantly larger.

  6. Average Rating by Age Group and Category
# Antithesis
# Box plots of average_rating per age group, split by category (fill colour)
ggplot(
  data = books,
  mapping = aes(x = age_group, y = average_rating, fill = category)
) +
  geom_boxplot() +
  theme_minimal() +
  labs(
    title = "Average Rating by Age Group and Category",
    x = "Age Group",
    y = "Average Rating"
  )

Antithesis argument—- Here we observe that the average ratings are higher for Fantasy/Adventure Fiction. This supports the idea that genre preference is not entirely dependent on age, but also on the personal phase the reader is going through, regardless of age. For example, an adult reader going through a psychologically difficult period may feel the need to read something lighter and easier to digest, which is why they may choose the Fantasy genre.

Counterargument –— The difference between the averages is very small, almost negligible. The results are also influenced by the presence of outliers — a few extremely low ratings are enough to bring down the overall average, even if the majority of ratings are high. Additionally, the library contains more books from the Real-World Fiction genre, meaning we have a larger sample size for this category compared to Fantasy. Moreover, the idea of preference is often tied to adults’ general perception of the genre, not necessarily because every book they read from that genre is rated highly by them.

~ Data Analysis

Adults who read Real-World Fiction

# Number of rows in the adult age group with category Real-World Fiction
with(books, sum(age_group == "adult" & category == "Real-World Fiction"))
## [1] 4881

Adults who read Fantasy/Adventure Fiction

# Number of rows in the adult age group with category Fantasy/Adventure Fiction
with(books, sum(age_group == "adult" & category == "Fantasy/Adventure Fiction"))
## [1] 735

Comparison Plot

library(ggplot2)
# Done previously for thesis
# Step 1: Count the number of books in each age group / category pair
adult_real    <- sum(books$age_group == "adult" & books$category == "Real-World Fiction")
adult_fantasy <- sum(books$age_group == "adult" & books$category == "Fantasy/Adventure Fiction")
young_real    <- sum(books$age_group == "young" & books$category == "Real-World Fiction")
young_fantasy <- sum(books$age_group == "young" & books$category == "Fantasy/Adventure Fiction")

# Step 2: Collect the four counts in a data frame (rows ordered adult-real,
# adult-fantasy, young-real, young-fantasy to match the count vector)
compare_df <- data.frame(
  age_group = rep(c("Adult", "Young"), each = 2),
  category = rep(c("Real-World Fiction", "Fantasy/Adventure Fiction"), 2),
  count = c(adult_real, adult_fantasy, young_real, young_fantasy)
)

# Step 3: Grouped bar chart with a pink palette.
# geom_col() is the direct form of geom_bar(stat = "identity"). The empty
# trailing argument that used to follow plot.title inside theme() is removed:
# it only worked because theme() tolerates empty arguments.
ggplot(compare_df, aes(x = category, y = count, fill = age_group)) +
  geom_col(position = position_dodge(width = 0.7), width = 0.6) +
  scale_fill_manual(values = c("Adult" = "#f7c6c7", "Young" = "#f49ac2")) +
  labs(title = " Reader Preferences by Age Group and Category",
       x = "Book Category", y = "Number of Books", fill = "Age Group") +
  theme_minimal(base_size = 14) +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  # Print each count just above its bar, dodged the same way as the bars
  geom_text(aes(label = count),
            position = position_dodge(width = 0.7),
            vjust = -0.5, size = 4)

Adults give high ratings to Fantasy Fiction

# Mean average_rating over adult-group Fantasy/Adventure Fiction rows
with(
  books,
  mean(
    average_rating[age_group == "adult" & category == "Fantasy/Adventure Fiction"],
    na.rm = TRUE
  )
)
## [1] 4.025306

Adults give lower ratings to Real-World Fiction

# Mean average_rating over adult-group Real-World Fiction rows
with(
  books,
  mean(
    average_rating[age_group == "adult" & category == "Real-World Fiction"],
    na.rm = TRUE
  )
)
## [1] 3.988982

Comparison Graph

#Thesis
   library(ggplot2)

# Mean average_rating for the adult age group, one value per category
avg_fantasy <- mean(books$average_rating[books$age_group == "adult" & books$category == "Fantasy/Adventure Fiction"], na.rm = TRUE)
avg_real <- mean(books$average_rating[books$age_group == "adult" & books$category == "Real-World Fiction"], na.rm = TRUE)

# Put the two means in a data frame for plotting
rating_df <- data.frame(
  category = c("Fantasy/Adventure Fiction", "Real-World Fiction"),
  avg_rating = c(avg_fantasy, avg_real)
)

# Bar chart in emerald tones; the y axis is fixed to the 0-5 rating scale.
# geom_col() is the direct form of geom_bar(stat = "identity"). The empty
# trailing argument that used to follow plot.title inside theme() is removed:
# it only worked because theme() tolerates empty arguments.
ggplot(rating_df, aes(x = category, y = avg_rating, fill = category)) +
  geom_col(width = 0.5, show.legend = FALSE) +
  scale_fill_manual(values = c("#50c878", "#2e8b57")) +
  labs(title = " Average Rating by Adult Readers",
       x = "Book Category", y = "Average Rating") +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  ylim(0, 5) +
  # Label each bar with its mean rounded to two decimals
  geom_text(aes(label = round(avg_rating, 2)), vjust = -0.5, size = 5)

Argument—Here we see that the average rating in both cases, when rounded to the nearest whole number, is approximately 4. This suggests that the Real-World Fiction titles read also have a high rating.

Average rating of Fantasy/Adventure Fiction in the young age group

# Mean average_rating over young-group Fantasy/Adventure Fiction rows
with(
  books,
  mean(
    average_rating[age_group == "young" & category == "Fantasy/Adventure Fiction"],
    na.rm = TRUE
  )
)
## [1] 4.021464

Average rating of Real-World Fiction in the young age group

# Mean average_rating over young-group Real-World Fiction rows
with(
  books,
  mean(
    average_rating[age_group == "young" & category == "Real-World Fiction"],
    na.rm = TRUE
  )
)
## [1] 3.993071

Combined Comparison Chart (Young vs Adult)

Create the Data Frame combined_df

# Build combined_df by hand: one row per age group / category pair with its
# mean average_rating. The label vectors and the vector of means are written
# in the same row order (young-fantasy, young-real, adult-real, adult-fantasy).
combined_df <- with(books, data.frame(
  age_group = c("Young", "Young", "Adult", "Adult"),
  category = c(
    "Fantasy/Adventure Fiction", "Real-World Fiction",
    "Real-World Fiction", "Fantasy/Adventure Fiction"
  ),
  avg_rating = c(
    mean(average_rating[age_group == "young" & category == "Fantasy/Adventure Fiction"], na.rm = TRUE),
    mean(average_rating[age_group == "young" & category == "Real-World Fiction"], na.rm = TRUE),
    mean(average_rating[age_group == "adult" & category == "Real-World Fiction"], na.rm = TRUE),
    mean(average_rating[age_group == "adult" & category == "Fantasy/Adventure Fiction"], na.rm = TRUE)
  )
))
# Create grouped bar chart
# Thesis

# Dodged bars of mean rating per category, one bar per age group; the y axis
# is fixed to the 0-5 rating scale.
# geom_col() is the direct form of geom_bar(stat = "identity"). The empty
# trailing argument that used to follow plot.title inside theme() is removed:
# it only worked because theme() tolerates empty arguments.
ggplot(combined_df, aes(x = category, y = avg_rating, fill = age_group)) +
  geom_col(position = position_dodge(width = 0.7), width = 0.6) +
  scale_fill_manual(values = c("Young" = "#cda4de", "Adult" = "#50c878")) +  # violet and emerald
  labs(title = "Average Rating Comparison by Age Group and Category",
       x = "Book Category", y = "Average Rating", fill = "Age Group") +
  theme_minimal(base_size = 14) +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  ylim(0, 5) +
  # Label each bar with its mean rounded to two decimals, dodged like the bars
  geom_text(aes(label = round(avg_rating, 2)),
            position = position_dodge(width = 0.7),
            vjust = -0.5, size = 4)

Argument—We observe the same trend here as well, considering the fact that the Real-World Fiction category also contains more outlier values in the ratings, which directly affects the result.

library(ggplot2)
library(dplyr)
library(scales)
#Thesis

# Step 1: Sum work_rating for every age group / category pair.
# .groups = "drop" returns an ungrouped tibble, so the mutate() below and the
# plot do not inherit leftover grouping by age_group, and summarise() no
# longer prints its "has grouped output" message.
summary_df <- books %>%
  group_by(age_group, category) %>%
  summarise(
    total_work_rating = sum(work_rating, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(label = paste(age_group, "–", category))  # custom x-axis labels

# Step 2: Radial chart with readable full labels
# (bars drawn in polar coordinates, one per age group / category label)
ggplot(summary_df, aes(x = label, y = total_work_rating, fill = age_group)) +
  geom_col() +
  coord_polar() +
  labs(title = " Radial View: Work Rating by Age Group and Category",
       x = "", y = "") +
  scale_y_continuous(labels = comma) +
  scale_fill_manual(values = c("adult" = "#50c878", "young" = "#cda4de")) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    axis.text.x = element_text(size = 6, color = "black")
  )

Argument – Here it is quite clear that the Real-World Fiction category has received the highest number of reviews and ratings, and this trend has come particularly from the ‘adult’ age group, which has contributed a greater number of ratings.

library(dplyr)
#Thesis

# Total 1-star and 5-star rating counts for each age group / category pair.
# .groups is left unset, so the result stays grouped by age_group and
# summarise() prints the message shown below.
rating_summary <- summarise(
  group_by(books, age_group, category),
  total_rating_1 = sum(ratings_1, na.rm = TRUE),
  total_rating_5 = sum(ratings_5, na.rm = TRUE)
)
## `summarise()` has grouped output by 'age_group'. You can override using the
## `.groups` argument.
# Show the summary table
print(rating_summary)
## # A tibble: 4 × 4
## # Groups:   age_group [2]
##   age_group category                  total_rating_1 total_rating_5
##   <chr>     <chr>                              <int>          <int>
## 1 adult     Fantasy/Adventure Fiction        1122821       21229147
## 2 adult     Real-World Fiction               6241617      107814254
## 3 young     Fantasy/Adventure Fiction        2472688       53555354
## 4 young     Real-World Fiction               3246421       47939506

Use tidyr::pivot_longer() to reshape the data for easy plotting.

library(dplyr)
library(tidyr)

# Rebuild rating_summary in long format for plotting:
#   1. total the 1-star and 5-star counts per age group / category pair
#      (.groups = "drop" returns an ungrouped tibble and silences the message),
#   2. stack the two totals into rating_type / count columns,
#   3. relabel rating_type as "1 Star" / "5 Star".
rating_summary <- books %>%
  group_by(age_group, category) %>%
  summarise(
    rating_1 = sum(ratings_1, na.rm = TRUE),
    rating_5 = sum(ratings_5, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  pivot_longer(
    cols = starts_with("rating_"),
    names_to = "rating_type",
    values_to = "count"
  ) %>%
  mutate(
    rating_type = recode(rating_type, rating_1 = "1 Star", rating_5 = "5 Star")
  )

Chart of 1-star and 5-star ratings for each category, by age group. Thesis

library(ggplot2)
library(dplyr)

# Fix the level order of rating_type: "1 Star" first, then "5 Star"
rating_summary$rating_type <- factor(
  rating_summary$rating_type,
  levels = c("1 Star", "5 Star")
)

# One pie per facet (age group in rows, category in columns): a single stacked
# bar bent into a circle with coord_polar(theta = "y")
ggplot(
  data = rating_summary,
  mapping = aes(x = "", y = count, fill = rating_type)
) +
  geom_col(width = 1, color = "white") +
  coord_polar(theta = "y") +
  facet_grid(age_group ~ category) +
  scale_fill_manual(values = c("1 Star" = "#f08080", "5 Star" = "#7bd389")) +
  labs(
    title = "Proportion of 1★ vs 5★ Ratings by Age Group and Category",
    fill = "Star Rating"
  ) +
  theme_void(base_size = 12) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
    strip.text = element_text(face = "bold", size = 11),
    legend.title = element_text(face = "bold")
  )

Argument – This ultimately confirms the validity of our thesis, as we see that the Real-World Fiction genre has received the highest number of 5-star ratings. Considering that this genre also had the highest number of reviews, this solidifies the conclusion that adults prefer this genre the most—perhaps because it resonates more with their lifestyle and way of thinking. It is also noteworthy that for the Real-World Fiction category, adults tend to give either 1-star or 5-star ratings. This explains the earlier graphical representations where we observed a similar average between the two genres, as this pattern is clearly influenced by outlier values.

library(plotly)
## Warning: package 'plotly' was built under R version 4.3.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Interactive scatter plot: work_rating (x) against average_rating (y), one
# marker per book. Colour encodes age group, marker symbol encodes category,
# and the hover text shows the book's title and values.
#
# An explicit colour vector is supplied because, with only two age groups,
# the default palette lookup calls RColorBrewer::brewer.pal() with n = 2 and
# warns "minimal value for n is 3". The two colours are the ones the earlier
# charts use for the adult and young groups.
plot_ly(
  data = books,
  x = ~work_rating,
  y = ~average_rating,
  type = 'scatter',
  mode = 'markers',
  color = ~age_group,
  colors = c("#50c878", "#cda4de"),  # assigned in level order: adult, young
  symbol = ~category,
  text = ~paste("Title:", original_title,
                "<br>Age Group:", age_group,
                "<br>Category:", category,
                "<br>Avg Rating:", round(average_rating, 2),
                "<br>Total Engagement:", work_rating),
  hoverinfo = 'text',
  marker = list(size = 6, opacity = 0.6)
) %>%
  layout(
    title = "Work Rating vs. Average Rating by Age Group and Category",
    xaxis = list(title = "Total Work Rating"),
    yaxis = list(title = "Average Rating", range = c(0, 5)),
    legend = list(title = list(text = "<b>Age Group & Category</b>"))
  )

Colors = age group, shapes = category